In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

import seaborn as sns
import plotly.express as px #for visualization
import matplotlib.pyplot as plt #for visualization 
In [2]:
df100=pd.read_csv("C:\\Users\\2001n\\OneDrive\\Documents\\CHURN_DATASET.csv")
df100
Out[2]:
customerID gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity ... DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 7590-VHVEG Female 0 Yes No 1 No No phone service DSL No ... No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 5575-GNVDE Male 0 No No 34 Yes No DSL Yes ... Yes No No No One year No Mailed check 56.95 1889.5 No
2 3668-QPYBK Male 0 No No 2 Yes No DSL Yes ... No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 7795-CFOCW Male 0 No No 45 No No phone service DSL Yes ... Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 9237-HQITU Female 0 No No 2 Yes No Fiber optic No ... No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 6840-RESVB Male 0 Yes Yes 24 Yes Yes DSL Yes ... Yes Yes Yes Yes One year Yes Mailed check 84.80 1990.5 No
7039 2234-XADUH Female 0 Yes Yes 72 Yes Yes Fiber optic No ... Yes No Yes Yes One year Yes Credit card (automatic) 103.20 7362.9 No
7040 4801-JZAZL Female 0 Yes Yes 11 No No phone service DSL Yes ... No No No No Month-to-month Yes Electronic check 29.60 346.45 No
7041 8361-LTMKD Male 1 Yes No 4 Yes Yes Fiber optic No ... No No No No Month-to-month Yes Mailed check 74.40 306.6 Yes
7042 3186-AJIEK Male 0 No No 66 Yes No Fiber optic Yes ... Yes Yes Yes Yes Two year Yes Bank transfer (automatic) 105.65 6844.5 No

7043 rows × 21 columns

In [3]:
df=df100.drop(["customerID"],axis = 1)
df
Out[3]:
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
0 Female 0 Yes No 1 No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No
1 Male 0 No No 34 Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.5 No
2 Male 0 No No 2 Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes
3 Male 0 No No 45 No No phone service DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No
4 Female 0 No No 2 Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 Male 0 Yes Yes 24 Yes Yes DSL Yes No Yes Yes Yes Yes One year Yes Mailed check 84.80 1990.5 No
7039 Female 0 Yes Yes 72 Yes Yes Fiber optic No Yes Yes No Yes Yes One year Yes Credit card (automatic) 103.20 7362.9 No
7040 Female 0 Yes Yes 11 No No phone service DSL Yes No No No No No Month-to-month Yes Electronic check 29.60 346.45 No
7041 Male 1 Yes No 4 Yes Yes Fiber optic No No No No No No Month-to-month Yes Mailed check 74.40 306.6 Yes
7042 Male 0 No No 66 Yes No Fiber optic Yes No Yes Yes Yes Yes Two year Yes Bank transfer (automatic) 105.65 6844.5 No

7043 rows × 20 columns

In [4]:
df.shape
Out[4]:
(7043, 20)
In [5]:
df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7043 non-null   object 
 1   SeniorCitizen     7043 non-null   int64  
 2   Partner           7043 non-null   object 
 3   Dependents        7043 non-null   object 
 4   tenure            7043 non-null   int64  
 5   PhoneService      7043 non-null   object 
 6   MultipleLines     7043 non-null   object 
 7   InternetService   7043 non-null   object 
 8   OnlineSecurity    7043 non-null   object 
 9   OnlineBackup      7043 non-null   object 
 10  DeviceProtection  7043 non-null   object 
 11  TechSupport       7043 non-null   object 
 12  StreamingTV       7043 non-null   object 
 13  StreamingMovies   7043 non-null   object 
 14  Contract          7043 non-null   object 
 15  PaperlessBilling  7043 non-null   object 
 16  PaymentMethod     7043 non-null   object 
 17  MonthlyCharges    7043 non-null   float64
 18  TotalCharges      7043 non-null   object 
 19  Churn             7043 non-null   object 
dtypes: float64(1), int64(2), object(17)
memory usage: 1.1+ MB
In [6]:
def dataoveriew(df, message):
    
    print(f'{message}:n')  # it contains expressions in the braces
   
    print('Number of rows: '   , df.shape[0])
    print("Number of features:", df.shape[1])
    print(df.columns.tolist())
    print("Missing values:", df.isnull().sum().values.sum())
    print("Unique values:")
    print(df.nunique())

dataoveriew(df, 'Overview of the dataset')
Overview of the dataset:n
Number of rows:  7043
Number of features: 20
['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'tenure', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn']
Missing values: 0
Unique values:
gender                 2
SeniorCitizen          2
Partner                2
Dependents             2
tenure                73
PhoneService           2
MultipleLines          3
InternetService        3
OnlineSecurity         3
OnlineBackup           3
DeviceProtection       3
TechSupport            3
StreamingTV            3
StreamingMovies        3
Contract               3
PaperlessBilling       2
PaymentMethod          4
MonthlyCharges      1585
TotalCharges        6531
Churn                  2
dtype: int64
In [7]:
missing = pd.DataFrame((df.isnull().sum())*100/df.shape[0]).reset_index()

plt.figure(figsize=(16,5))

sns.pointplot(x ='index',y = 0,data=missing)

plt.xticks(rotation =90,fontsize =7)

plt.title("Percentage of Missing values")
plt.ylabel("PERCENTAGE")

plt.show()
No description has been provided for this image
In [8]:
df.TotalCharges = pd.to_numeric(df.TotalCharges, errors='coerce')
df.isnull().sum()
Out[8]:
gender               0
SeniorCitizen        0
Partner              0
Dependents           0
tenure               0
PhoneService         0
MultipleLines        0
InternetService      0
OnlineSecurity       0
OnlineBackup         0
DeviceProtection     0
TechSupport          0
StreamingTV          0
StreamingMovies      0
Contract             0
PaperlessBilling     0
PaymentMethod        0
MonthlyCharges       0
TotalCharges        11
Churn                0
dtype: int64
In [9]:
df.loc[df['TotalCharges'].isnull() == True]
Out[9]:
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn
488 Female 0 Yes Yes 0 No No phone service DSL Yes No Yes Yes Yes No Two year Yes Bank transfer (automatic) 52.55 NaN No
753 Male 0 No Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 20.25 NaN No
936 Female 0 Yes Yes 0 Yes No DSL Yes Yes Yes No Yes Yes Two year No Mailed check 80.85 NaN No
1082 Male 0 Yes Yes 0 Yes Yes No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 25.75 NaN No
1340 Female 0 Yes Yes 0 No No phone service DSL Yes Yes Yes Yes Yes No Two year No Credit card (automatic) 56.05 NaN No
3331 Male 0 Yes Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 19.85 NaN No
3826 Male 0 Yes Yes 0 Yes Yes No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 25.35 NaN No
4380 Female 0 Yes Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service Two year No Mailed check 20.00 NaN No
5218 Male 0 Yes Yes 0 Yes No No No internet service No internet service No internet service No internet service No internet service No internet service One year Yes Mailed check 19.70 NaN No
6670 Female 0 Yes Yes 0 Yes Yes DSL No Yes Yes Yes Yes No Two year No Mailed check 73.35 NaN No
6754 Male 0 No Yes 0 Yes Yes DSL Yes Yes No Yes No No Two year Yes Bank transfer (automatic) 61.90 NaN No
In [10]:
df.dropna(how = 'any', inplace = True)
In [11]:
df.info()
<class 'pandas.core.frame.DataFrame'>
Index: 7032 entries, 0 to 7042
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   gender            7032 non-null   object 
 1   SeniorCitizen     7032 non-null   int64  
 2   Partner           7032 non-null   object 
 3   Dependents        7032 non-null   object 
 4   tenure            7032 non-null   int64  
 5   PhoneService      7032 non-null   object 
 6   MultipleLines     7032 non-null   object 
 7   InternetService   7032 non-null   object 
 8   OnlineSecurity    7032 non-null   object 
 9   OnlineBackup      7032 non-null   object 
 10  DeviceProtection  7032 non-null   object 
 11  TechSupport       7032 non-null   object 
 12  StreamingTV       7032 non-null   object 
 13  StreamingMovies   7032 non-null   object 
 14  Contract          7032 non-null   object 
 15  PaperlessBilling  7032 non-null   object 
 16  PaymentMethod     7032 non-null   object 
 17  MonthlyCharges    7032 non-null   float64
 18  TotalCharges      7032 non-null   float64
 19  Churn             7032 non-null   object 
dtypes: float64(2), int64(2), object(16)
memory usage: 1.1+ MB
In [12]:
df.shape
Out[12]:
(7032, 20)
In [13]:
df1 = df["Churn"].value_counts().reset_index()  # Reset index to create columns, Defult index
df1.columns = ['Category', 'count']  # Rename columns explicitly

df1
Out[13]:
Category count
0 No 5163
1 Yes 1869
In [14]:
df1 = df["Churn"].value_counts().reset_index()  # Reset index to create columns, Defult index
df1.columns = ['Category', 'count']  # Rename columns explicitly

fig = px.pie(df1, values='count', names='Category',color_discrete_sequence=["green", "red"],
            title='Distribution of Churn')
fig.show()
In [15]:
df2 = df.copy()
In [16]:
labels = ["{0} - {1}".format(i, i + 11) for i in range(1, 72, 12)]

df2['tenure_group'] = pd.cut(df.tenure, range(1, 80, 12), right=False, labels=labels)
In [17]:
df2.tenure_group.value_counts()
Out[17]:
tenure_group
1 - 12     2175
61 - 72    1407
13 - 24    1024
25 - 36     832
49 - 60     832
37 - 48     762
Name: count, dtype: int64
In [18]:
df2
Out[18]:
gender SeniorCitizen Partner Dependents tenure PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup ... TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn tenure_group
0 Female 0 Yes No 1 No No phone service DSL No Yes ... No No No Month-to-month Yes Electronic check 29.85 29.85 No 1 - 12
1 Male 0 No No 34 Yes No DSL Yes No ... No No No One year No Mailed check 56.95 1889.50 No 25 - 36
2 Male 0 No No 2 Yes No DSL Yes Yes ... No No No Month-to-month Yes Mailed check 53.85 108.15 Yes 1 - 12
3 Male 0 No No 45 No No phone service DSL Yes No ... Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No 37 - 48
4 Female 0 No No 2 Yes No Fiber optic No No ... No No No Month-to-month Yes Electronic check 70.70 151.65 Yes 1 - 12
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 Male 0 Yes Yes 24 Yes Yes DSL Yes No ... Yes Yes Yes One year Yes Mailed check 84.80 1990.50 No 13 - 24
7039 Female 0 Yes Yes 72 Yes Yes Fiber optic No Yes ... No Yes Yes One year Yes Credit card (automatic) 103.20 7362.90 No 61 - 72
7040 Female 0 Yes Yes 11 No No phone service DSL Yes No ... No No No Month-to-month Yes Electronic check 29.60 346.45 No 1 - 12
7041 Male 1 Yes No 4 Yes Yes Fiber optic No No ... No No No Month-to-month Yes Mailed check 74.40 306.60 Yes 1 - 12
7042 Male 0 No No 66 Yes No Fiber optic Yes No ... Yes Yes Yes Two year Yes Bank transfer (automatic) 105.65 6844.50 No 61 - 72

7032 rows × 21 columns

In [19]:
df2.shape
Out[19]:
(7032, 21)
In [ ]:
 
In [20]:
import warnings
warnings.filterwarnings('ignore')
In [21]:
df2.drop(columns= ['tenure'], axis=1, inplace=True)
df2
Out[21]:
gender SeniorCitizen Partner Dependents PhoneService MultipleLines InternetService OnlineSecurity OnlineBackup DeviceProtection TechSupport StreamingTV StreamingMovies Contract PaperlessBilling PaymentMethod MonthlyCharges TotalCharges Churn tenure_group
0 Female 0 Yes No No No phone service DSL No Yes No No No No Month-to-month Yes Electronic check 29.85 29.85 No 1 - 12
1 Male 0 No No Yes No DSL Yes No Yes No No No One year No Mailed check 56.95 1889.50 No 25 - 36
2 Male 0 No No Yes No DSL Yes Yes No No No No Month-to-month Yes Mailed check 53.85 108.15 Yes 1 - 12
3 Male 0 No No No No phone service DSL Yes No Yes Yes No No One year No Bank transfer (automatic) 42.30 1840.75 No 37 - 48
4 Female 0 No No Yes No Fiber optic No No No No No No Month-to-month Yes Electronic check 70.70 151.65 Yes 1 - 12
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 Male 0 Yes Yes Yes Yes DSL Yes No Yes Yes Yes Yes One year Yes Mailed check 84.80 1990.50 No 13 - 24
7039 Female 0 Yes Yes Yes Yes Fiber optic No Yes Yes No Yes Yes One year Yes Credit card (automatic) 103.20 7362.90 No 61 - 72
7040 Female 0 Yes Yes No No phone service DSL Yes No No No No No Month-to-month Yes Electronic check 29.60 346.45 No 1 - 12
7041 Male 1 Yes No Yes Yes Fiber optic No No No No No No Month-to-month Yes Mailed check 74.40 306.60 Yes 1 - 12
7042 Male 0 No No Yes No Fiber optic Yes No Yes Yes Yes Yes Two year Yes Bank transfer (automatic) 105.65 6844.50 No 61 - 72

7032 rows × 20 columns

In [22]:
for i, predictor in enumerate(df2.drop(columns=['Churn', 'TotalCharges', 'MonthlyCharges'])):
    plt.figure(i)
    plt.xticks(rotation = 90)
    sns.countplot(data=df2, x=predictor, hue='Churn')
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
No description has been provided for this image
In [ ]:
 
In [ ]:
 
In [23]:
def hist(feature):
   
    df2 = df.groupby([feature, 'Churn']).size().reset_index()
    df2 = df2.rename(columns={0: 'Count'})  # Corrected line
    
    fig = px.histogram(df2, x=feature, y='Count', color='Churn', marginal='box'
                       , title=('Churn rate frequency to distribution')
                       , color_discrete_sequence=["green", "red"])
    fig.show()
In [24]:
hist('tenure')
hist('MonthlyCharges')
hist('TotalCharges')
In [136]:
bin_df = pd.DataFrame()

#Update the binning dataframe

bin_df['tenure_bins'] =  pd.qcut(df['tenure'], q=3, labels= ['low', 'medium', 'high'])

bin_df['MonthlyCharges_bins'] =  pd.qcut(df['MonthlyCharges'], q=3, labels= ['low', 'medium', 'high'])

bin_df['TotalCharges_bins'] =  pd.qcut(df['TotalCharges'], q=3, labels= ['low', 'medium', 'high'])

bin_df['Churn'] = df['Churn']
In [138]:
def plot_bar(feature, data):
    fig = px.bar(data.groupby([feature, 'Churn']).size().reset_index(), 
                 x=feature, y=0, color='Churn', 
                 title=f'Churn rate frequency to {feature} bins', 
                 barmode='group',
                 color_discrete_sequence=["green", "red"])
    fig.show()

plot_bar('tenure_bins', bin_df)
plot_bar('MonthlyCharges_bins', bin_df)
plot_bar('TotalCharges_bins', bin_df)

DATA PREPROCESSING

In [27]:
from ydata_profiling import ProfileReport
profile = ProfileReport(df, title="Profiling Report")
In [28]:
profile
Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]
Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]
Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]
Out[28]:

In [29]:
from sklearn import preprocessing
In [30]:
df3 = pd.get_dummies(df2, columns=['gender','Partner','Dependents','tenure_group','PhoneService','MultipleLines','InternetService','OnlineSecurity','OnlineBackup','DeviceProtection','TechSupport','StreamingTV','StreamingMovies','Contract','PaperlessBilling','PaymentMethod','Churn'])
In [31]:
df3
Out[31]:
SeniorCitizen MonthlyCharges TotalCharges gender_Female gender_Male Partner_No Partner_Yes Dependents_No Dependents_Yes tenure_group_1 - 12 ... Contract_One year Contract_Two year PaperlessBilling_No PaperlessBilling_Yes PaymentMethod_Bank transfer (automatic) PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check Churn_No Churn_Yes
0 0 29.85 29.85 True False False True True False True ... False False False True False False True False True False
1 0 56.95 1889.50 False True True False True False False ... True False True False False False False True True False
2 0 53.85 108.15 False True True False True False True ... False False False True False False False True False True
3 0 42.30 1840.75 False True True False True False False ... True False True False True False False False True False
4 0 70.70 151.65 True False True False True False True ... False False False True False False True False False True
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 0 84.80 1990.50 False True False True False True False ... True False False True False False False True True False
7039 0 103.20 7362.90 True False False True False True False ... True False False True False True False False True False
7040 0 29.60 346.45 True False False True False True True ... False False False True False False True False True False
7041 1 74.40 306.60 False True False True True False True ... False False False True False False False True False True
7042 0 105.65 6844.50 False True True False True False False ... False True False True True False False False True False

7032 rows × 52 columns

In [32]:
df4 = df3.replace({True :1,False: 0})
In [33]:
df4
Out[33]:
SeniorCitizen MonthlyCharges TotalCharges gender_Female gender_Male Partner_No Partner_Yes Dependents_No Dependents_Yes tenure_group_1 - 12 ... Contract_One year Contract_Two year PaperlessBilling_No PaperlessBilling_Yes PaymentMethod_Bank transfer (automatic) PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check Churn_No Churn_Yes
0 0 29.85 29.85 1 0 0 1 1 0 1 ... 0 0 0 1 0 0 1 0 1 0
1 0 56.95 1889.50 0 1 1 0 1 0 0 ... 1 0 1 0 0 0 0 1 1 0
2 0 53.85 108.15 0 1 1 0 1 0 1 ... 0 0 0 1 0 0 0 1 0 1
3 0 42.30 1840.75 0 1 1 0 1 0 0 ... 1 0 1 0 1 0 0 0 1 0
4 0 70.70 151.65 1 0 1 0 1 0 1 ... 0 0 0 1 0 0 1 0 0 1
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 0 84.80 1990.50 0 1 0 1 0 1 0 ... 1 0 0 1 0 0 0 1 1 0
7039 0 103.20 7362.90 1 0 0 1 0 1 0 ... 1 0 0 1 0 1 0 0 1 0
7040 0 29.60 346.45 1 0 0 1 0 1 1 ... 0 0 0 1 0 0 1 0 1 0
7041 1 74.40 306.60 0 1 0 1 1 0 1 ... 0 0 0 1 0 0 0 1 0 1
7042 0 105.65 6844.50 0 1 1 0 1 0 0 ... 0 1 0 1 1 0 0 0 1 0

7032 rows × 52 columns

In [34]:
# Step 1: Exclude non-numeric columns
numeric_df = df4.select_dtypes(include=['number'])

# Step 2: Check for non-numeric columns with string values
non_numeric_cols = df4.columns.difference(numeric_df.columns)

# If there are non-numeric columns with string values, you can drop them or convert them to numeric if possible.
# For example, to drop them:
df4 = df4.drop(columns=non_numeric_cols)

# Now, you can calculate the correlation matrix
c_matrix = df4.corr()
In [35]:
c_matrix
Out[35]:
SeniorCitizen MonthlyCharges TotalCharges gender_Female gender_Male Partner_No Partner_Yes Dependents_No Dependents_Yes tenure_group_1 - 12 ... Contract_One year Contract_Two year PaperlessBilling_No PaperlessBilling_Yes PaymentMethod_Bank transfer (automatic) PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check Churn_No Churn_Yes
SeniorCitizen 1.000000 0.219874 0.102411 0.001819 -0.001819 -0.016957 0.016957 0.210550 -0.210550 -0.027713 ... -0.046491 -0.116205 -0.156258 0.156258 -0.016235 -0.024359 0.171322 -0.152987 -0.150541 0.150541
MonthlyCharges 0.219874 1.000000 0.651065 0.013779 -0.013779 -0.097825 0.097825 0.112343 -0.112343 -0.191881 ... 0.004810 -0.073256 -0.351930 0.351930 0.042410 0.030055 0.271117 -0.376568 -0.192858 0.192858
TotalCharges 0.102411 0.651065 1.000000 -0.000048 0.000048 -0.319072 0.319072 -0.064653 0.064653 -0.592443 ... 0.170569 0.358036 -0.157830 0.157830 0.186119 0.182663 -0.060436 -0.294708 0.199484 -0.199484
gender_Female 0.001819 0.013779 -0.000048 1.000000 -1.000000 -0.001379 0.001379 0.010349 -0.010349 0.001050 ... -0.007755 0.003603 -0.011902 0.011902 0.015973 -0.001632 -0.000844 -0.013199 -0.008545 0.008545
gender_Male -0.001819 -0.013779 0.000048 -1.000000 1.000000 0.001379 -0.001379 -0.010349 0.010349 -0.001050 ... 0.007755 -0.003603 0.011902 -0.011902 -0.015973 0.001632 0.000844 0.013199 0.008545 -0.008545
Partner_No -0.016957 -0.097825 -0.319072 -0.001379 0.001379 1.000000 -1.000000 0.452269 -0.452269 0.305061 ... -0.083067 -0.247334 -0.013957 0.013957 -0.111406 -0.082327 0.083207 0.096948 -0.149982 0.149982
Partner_Yes 0.016957 0.097825 0.319072 0.001379 -0.001379 -1.000000 1.000000 -0.452269 0.452269 -0.305061 ... 0.083067 0.247334 0.013957 -0.013957 0.111406 0.082327 -0.083207 -0.096948 0.149982 -0.149982
Dependents_No 0.210550 0.112343 -0.064653 0.010349 -0.010349 0.452269 -0.452269 1.000000 -1.000000 0.145379 ... -0.069222 -0.201699 -0.110131 0.110131 -0.052369 -0.061134 0.149274 -0.056448 -0.163128 0.163128
Dependents_Yes -0.210550 -0.112343 0.064653 -0.010349 0.010349 -0.452269 0.452269 -1.000000 1.000000 -0.145379 ... 0.069222 0.201699 0.110131 -0.110131 0.052369 0.061134 -0.149274 0.056448 0.163128 -0.163128
tenure_group_1 - 12 -0.027713 -0.191881 -0.592443 0.001050 -0.001050 0.305061 -0.305061 0.145379 -0.145379 1.000000 ... -0.251299 -0.333850 0.003860 -0.003860 -0.185855 -0.184165 0.160530 0.183222 -0.319628 0.319628
tenure_group_13 - 24 0.001860 -0.047220 -0.210745 0.000649 -0.000649 0.048481 -0.048481 0.001459 -0.001459 -0.276268 ... -0.017196 -0.146749 -0.003328 0.003328 -0.046329 -0.039647 0.030387 0.050371 -0.019929 0.019929
tenure_group_25 - 36 0.027317 0.009465 -0.047370 -0.006249 0.006249 -0.003131 0.003131 -0.009289 0.009289 -0.245138 ... 0.082077 -0.106618 0.005504 -0.005504 -0.000472 0.008599 0.003897 -0.012360 0.040997 -0.040997
tenure_group_37 - 48 -0.000929 0.017614 0.083696 0.020658 -0.020658 -0.035092 0.035092 -0.023544 0.023544 -0.233286 ... 0.122003 -0.004919 0.001538 -0.001538 0.029750 0.029093 -0.019634 -0.035775 0.059579 -0.059579
tenure_group_49 - 60 0.014186 0.070048 0.252905 0.004319 -0.004319 -0.105341 0.105341 -0.031419 0.031419 -0.245138 ... 0.158917 0.080082 -0.010626 0.010626 0.060183 0.048167 -0.030584 -0.072170 0.100800 -0.100800
tenure_group_61 - 72 -0.002407 0.185440 0.639312 -0.016279 0.016279 -0.280353 0.280353 -0.118090 0.118090 -0.334681 ... 0.016142 0.540336 0.001415 -0.001415 0.184249 0.179294 -0.175456 -0.160053 0.226078 -0.226078
PhoneService_No -0.008392 -0.248033 -0.113008 -0.007515 0.007515 0.018397 -0.018397 -0.001078 0.001078 0.006950 ... 0.003142 -0.004442 0.016696 -0.016696 -0.008271 0.006916 -0.002747 0.004463 0.011691 -0.011691
PhoneService_Yes 0.008392 0.248033 0.113008 0.007515 -0.007515 -0.018397 0.018397 0.001078 -0.001078 -0.006950 ... -0.003142 0.004442 -0.016696 0.016696 0.008271 -0.006916 0.002747 -0.004463 -0.011691 0.011691
MultipleLines_No -0.136377 -0.338514 -0.396765 -0.004335 0.004335 0.130028 -0.130028 -0.023388 0.023388 0.256171 ... 0.001694 -0.102756 0.151974 -0.151974 -0.069663 -0.063712 -0.080990 0.222395 0.032654 -0.032654
MultipleLines_No phone service -0.008392 -0.248033 -0.113008 -0.007515 0.007515 0.018397 -0.018397 -0.001078 0.001078 0.006950 ... 0.003142 -0.004442 0.016696 -0.016696 -0.008271 0.006916 -0.002747 0.004463 0.011691 -0.011691
MultipleLines_Yes 0.142996 0.490912 0.469042 0.008883 -0.008883 -0.142561 0.142561 0.024307 -0.024307 -0.263331 ... -0.003594 0.106618 -0.163746 0.163746 0.075429 0.060319 0.083583 -0.227672 -0.040033 0.040033
InternetService_DSL -0.108276 -0.161368 -0.052190 -0.007584 0.007584 0.001043 -0.001043 -0.051593 0.051593 -0.001470 ... 0.047300 0.030924 0.063390 -0.063390 0.024760 0.051222 -0.104293 0.042754 0.124141 -0.124141
InternetService_Fiber optic 0.254923 0.787195 0.360769 0.011189 -0.011189 -0.001235 0.001235 0.164101 -0.164101 -0.021441 ... -0.076809 -0.209965 -0.326470 0.326470 -0.022779 -0.050552 0.335763 -0.305984 -0.307463 0.307463
InternetService_No -0.182519 -0.763191 -0.374878 -0.004745 0.004745 0.000286 -0.000286 -0.138383 0.138383 0.027554 ... 0.038061 0.217542 0.320592 -0.320592 -0.001094 0.001870 -0.284608 0.319694 0.227578 -0.227578
OnlineSecurity_No 0.185145 0.360220 -0.064515 -0.010859 0.010859 0.129394 -0.129394 0.186979 -0.186979 0.196529 ... -0.122360 -0.352447 -0.267592 0.267592 -0.084436 -0.105963 0.335854 -0.190919 -0.342235 0.342235
OnlineSecurity_No internet service -0.182519 -0.763191 -0.374878 -0.004745 0.004745 0.000286 -0.000286 -0.138383 0.138383 0.027554 ... 0.038061 0.217542 0.320592 -0.320592 -0.001094 0.001870 -0.284608 0.319694 0.227578 -0.227578
OnlineSecurity_Yes -0.038576 0.296447 0.412619 0.016328 -0.016328 -0.143346 0.143346 -0.080786 0.080786 -0.242409 ... 0.100658 0.191698 0.004051 -0.004051 0.094366 0.115473 -0.112295 -0.079918 0.171270 -0.171270
OnlineBackup_No 0.087539 0.210126 -0.177633 -0.008605 0.008605 0.135626 -0.135626 0.137421 -0.137421 0.233228 ... -0.112133 -0.287128 -0.144218 0.144218 -0.082365 -0.088189 0.236414 -0.098438 -0.267595 0.267595
OnlineBackup_No internet service -0.182519 -0.763191 -0.374878 -0.004745 0.004745 0.000286 -0.000286 -0.138383 0.138383 0.027554 ... 0.038061 0.217542 0.320592 -0.320592 -0.001094 0.001870 -0.284608 0.319694 0.227578 -0.227578
OnlineBackup_Yes 0.066663 0.441529 0.510100 0.013093 -0.013093 -0.141849 0.141849 -0.023639 0.023639 -0.267366 ... 0.084113 0.111391 -0.127056 0.127056 0.086942 0.090455 -0.000364 -0.174075 0.082307 -0.082307
DeviceProtection_No 0.094403 0.171057 -0.189485 0.003163 -0.003163 0.146702 -0.146702 0.128053 -0.128053 0.239267 ... -0.130038 -0.338520 -0.166253 0.166253 -0.078561 -0.108008 0.239173 -0.085850 -0.252056 0.252056
DeviceProtection_No internet service -0.182519 -0.763191 -0.374878 -0.004745 0.004745 0.000286 -0.000286 -0.138383 0.138383 0.027554 ... 0.038061 0.217542 0.320592 -0.320592 -0.001094 0.001870 -0.284608 0.319694 0.227578 -0.227578
DeviceProtection_Yes 0.059514 0.482607 0.522881 0.000807 -0.000807 -0.153556 0.153556 -0.013900 0.013900 -0.273920 ... 0.102911 0.165248 -0.104079 0.104079 0.083047 0.111252 -0.003308 -0.187325 0.066193 -0.066193
TechSupport_No 0.205254 0.321267 -0.084270 -0.003815 0.003815 0.108875 -0.108875 0.171164 -0.171164 0.193915 ... -0.118709 -0.397788 -0.229875 0.229875 -0.090296 -0.107761 0.338529 -0.186388 -0.336877 0.336877
TechSupport_No internet service -0.182519 -0.763191 -0.374878 -0.004745 0.004745 0.000286 -0.000286 -0.138383 0.138383 0.027554 ... 0.038061 0.217542 0.320592 -0.320592 -0.001094 0.001870 -0.284608 0.319694 0.227578 -0.227578
TechSupport_Yes -0.060577 0.338301 0.432868 0.008507 -0.008507 -0.120206 0.120206 -0.063053 0.063053 -0.238628 ... 0.096258 0.240924 -0.037536 0.037536 0.100472 0.117024 -0.114807 -0.084631 0.164716 -0.164716
StreamingTV_No 0.048664 0.016015 -0.197144 -0.003088 0.003088 0.123394 -0.123394 0.099912 -0.099912 0.196100 ... -0.093495 -0.254456 -0.046715 0.046715 -0.044887 -0.041309 0.095426 -0.022650 -0.128435 0.128435
StreamingTV_No internet service -0.182519 -0.763191 -0.374878 -0.004745 0.004745 0.000286 -0.000286 -0.138383 0.138383 0.027554 ... 0.038061 0.217542 0.320592 -0.320592 -0.001094 0.001870 -0.284608 0.319694 0.227578 -0.227578
StreamingTV_Yes 0.105445 0.629668 0.515709 0.007124 -0.007124 -0.124483 0.124483 0.016499 -0.016499 -0.220761 ... 0.061930 0.072124 -0.224241 0.224241 0.046121 0.040010 0.144747 -0.247712 -0.063254 0.063254
StreamingMovies_No 0.034196 0.017271 -0.202605 -0.006078 0.006078 0.117488 -0.117488 0.078245 -0.078245 0.197479 ... -0.096613 -0.258495 -0.058987 0.058987 -0.047677 -0.049817 0.102617 -0.019648 -0.130920 0.130920
StreamingMovies_No internet service -0.182519 -0.763191 -0.374878 -0.004745 0.004745 0.000286 -0.000286 -0.138383 0.138383 0.027554 ... 0.038061 0.217542 0.320592 -0.320592 -0.001094 0.001870 -0.284608 0.319694 0.227578 -0.227578
StreamingMovies_Yes 0.119842 0.627235 0.519867 0.010105 -0.010105 -0.118108 0.118108 0.038375 -0.038375 -0.221388 ... 0.064780 0.075603 -0.211583 0.211583 0.048755 0.048398 0.137420 -0.250290 -0.060860 0.060860
Contract_Month-to-month 0.137752 0.058933 -0.446776 0.003251 -0.003251 0.280202 -0.280202 0.229715 -0.229715 0.492052 ... -0.570053 -0.621933 -0.168296 0.168296 -0.180159 -0.204960 0.330879 0.006209 -0.404565 0.404565
Contract_One year -0.046491 0.004810 0.170569 -0.007755 0.007755 -0.083067 0.083067 -0.069222 0.069222 -0.251299 ... 1.000000 -0.288843 0.052278 -0.052278 0.057629 0.067590 -0.109546 0.000197 0.178225 -0.178225
Contract_Two year -0.116205 -0.073256 0.358036 0.003603 -0.003603 -0.247334 0.247334 -0.201699 0.201699 -0.333850 ... -0.288843 1.000000 0.146281 -0.146281 0.155004 0.174410 -0.281147 -0.007423 0.301552 -0.301552
PaperlessBilling_No -0.156258 -0.351930 -0.157830 -0.011902 0.011902 -0.013957 0.013957 -0.110131 0.110131 0.003860 ... 0.052278 0.146281 1.000000 -1.000000 0.017469 0.013726 -0.208427 0.203981 0.191454 -0.191454
PaperlessBilling_Yes 0.156258 0.351930 0.157830 0.011902 -0.011902 0.013957 -0.013957 0.110131 -0.110131 -0.003860 ... -0.052278 -0.146281 -1.000000 1.000000 -0.017469 -0.013726 0.208427 -0.203981 -0.191454 0.191454
PaymentMethod_Bank transfer (automatic) -0.016235 0.042410 0.186119 0.015973 -0.015973 -0.111406 0.111406 -0.052369 0.052369 -0.185855 ... 0.057629 0.155004 0.017469 -0.017469 1.000000 -0.278423 -0.377270 -0.288097 0.118136 -0.118136
PaymentMethod_Credit card (automatic) -0.024359 0.030055 0.182663 -0.001632 0.001632 -0.082327 0.082327 -0.061134 0.061134 -0.184165 ... 0.067590 0.174410 0.013726 -0.013726 -0.278423 1.000000 -0.373978 -0.285583 0.134687 -0.134687
PaymentMethod_Electronic check 0.171322 0.271117 -0.060436 -0.000844 0.000844 0.083207 -0.083207 0.149274 -0.149274 0.160530 ... -0.109546 -0.281147 -0.208427 0.208427 -0.377270 -0.373978 1.000000 -0.386971 -0.301455 0.301455
PaymentMethod_Mailed check -0.152987 -0.376568 -0.294708 -0.013199 0.013199 0.096948 -0.096948 -0.056448 0.056448 0.183222 ... 0.000197 -0.007423 0.203981 -0.203981 -0.288097 -0.285583 -0.386971 1.000000 0.090773 -0.090773
Churn_No -0.150541 -0.192858 0.199484 -0.008545 0.008545 -0.149982 0.149982 -0.163128 0.163128 -0.319628 ... 0.178225 0.301552 0.191454 -0.191454 0.118136 0.134687 -0.301455 0.090773 1.000000 -1.000000
Churn_Yes 0.150541 0.192858 -0.199484 0.008545 -0.008545 0.149982 -0.149982 0.163128 -0.163128 0.319628 ... -0.178225 -0.301552 -0.191454 0.191454 -0.118136 -0.134687 0.301455 -0.090773 -1.000000 1.000000

52 rows × 52 columns

In [36]:
plt.figure(figsize = (12,12))
sns.heatmap(data = c_matrix,  cmap = "Paired")
Out[36]:
<Axes: >
No description has been provided for this image

SPLITTING THE DATASET

In [37]:
x = df4.drop(['Churn_Yes','Churn_No'],axis =1)
y = df4['Churn_Yes']
In [38]:
x
Out[38]:
SeniorCitizen MonthlyCharges TotalCharges gender_Female gender_Male Partner_No Partner_Yes Dependents_No Dependents_Yes tenure_group_1 - 12 ... StreamingMovies_Yes Contract_Month-to-month Contract_One year Contract_Two year PaperlessBilling_No PaperlessBilling_Yes PaymentMethod_Bank transfer (automatic) PaymentMethod_Credit card (automatic) PaymentMethod_Electronic check PaymentMethod_Mailed check
0 0 29.85 29.85 1 0 0 1 1 0 1 ... 0 1 0 0 0 1 0 0 1 0
1 0 56.95 1889.50 0 1 1 0 1 0 0 ... 0 0 1 0 1 0 0 0 0 1
2 0 53.85 108.15 0 1 1 0 1 0 1 ... 0 1 0 0 0 1 0 0 0 1
3 0 42.30 1840.75 0 1 1 0 1 0 0 ... 0 0 1 0 1 0 1 0 0 0
4 0 70.70 151.65 1 0 1 0 1 0 1 ... 0 1 0 0 0 1 0 0 1 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
7038 0 84.80 1990.50 0 1 0 1 0 1 0 ... 1 0 1 0 0 1 0 0 0 1
7039 0 103.20 7362.90 1 0 0 1 0 1 0 ... 1 0 1 0 0 1 0 1 0 0
7040 0 29.60 346.45 1 0 0 1 0 1 1 ... 0 1 0 0 0 1 0 0 1 0
7041 1 74.40 306.60 0 1 0 1 1 0 1 ... 0 1 0 0 0 1 0 0 0 1
7042 0 105.65 6844.50 0 1 1 0 1 0 0 ... 1 0 0 1 0 1 1 0 0 0

7032 rows × 50 columns

In [39]:
y
Out[39]:
0       0
1       0
2       1
3       0
4       1
       ..
7038    0
7039    0
7040    0
7041    1
7042    0
Name: Churn_Yes, Length: 7032, dtype: int64
In [40]:
 from sklearn.model_selection import train_test_split
In [41]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size = 0.2,random_state = 10)
In [42]:
len(x_train)
Out[42]:
5625
In [43]:
len(x_test)
Out[43]:
1407
In [44]:
from sklearn.metrics import mean_squared_error,confusion_matrix,classification_report,accuracy_score
from sklearn.metrics import roc_curve,roc_auc_score,accuracy_score,mean_absolute_error
from sklearn.metrics import r2_score,mean_squared_error
In [45]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
In [46]:
model.fit(x_train,y_train)
Out[46]:
LinearRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LinearRegression()
In [47]:
model.score(x_test,y_test) # Accuracy
Out[47]:
0.3084976908935517
In [48]:
y_predicted = model.predict(x_test)
from sklearn.metrics import confusion_matrix
In [49]:
model.predict(x_test)
Out[49]:
array([-0.07128335,  0.02163858,  0.14581771, ..., -0.0334702 ,
        0.19590921,  0.33938377])
In [50]:
y_test
Out[50]:
5401    0
1681    0
2076    0
3359    0
6629    0
       ..
1986    0
5608    0
6400    0
5560    0
3853    0
Name: Churn_Yes, Length: 1407, dtype: int64
In [51]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
In [52]:
model.fit(x_train,y_train)
Out[52]:
LogisticRegression()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
LogisticRegression()
In [53]:
model.score(x_test,y_test)
Out[53]:
0.8130774697938877
In [54]:
y_predicted = model.predict(x_test)
from sklearn.metrics import confusion_matrix
In [55]:
CM = confusion_matrix(y_test,y_predicted)
CM
Out[55]:
array([[944, 102],
       [161, 200]], dtype=int64)
In [56]:
plt.figure(figsize = (5,3))

sns.heatmap(CM, annot = True)

plt.xlabel("Predicted")
plt.ylabel("Truth")
Out[56]:
Text(33.22222222222222, 0.5, 'Truth')
No description has been provided for this image
In [57]:
from sklearn import tree
In [58]:
model = tree.DecisionTreeClassifier()
In [59]:
model.fit(x_train,y_train)
Out[59]:
DecisionTreeClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
DecisionTreeClassifier()
In [60]:
model.score(x_test,y_test)
Out[60]:
0.7320540156361052
In [61]:
y_predicted = model.predict(x_test)
from sklearn.metrics import confusion_matrix
In [62]:
CM = confusion_matrix(y_test,y_predicted)
CM
Out[62]:
array([[825, 221],
       [156, 205]], dtype=int64)
In [63]:
print(classification_report(y_test,y_predicted))
              precision    recall  f1-score   support

           0       0.84      0.79      0.81      1046
           1       0.48      0.57      0.52       361

    accuracy                           0.73      1407
   macro avg       0.66      0.68      0.67      1407
weighted avg       0.75      0.73      0.74      1407

In [64]:
plt.figure(figsize = (5,3))

sns.heatmap(CM, annot = True)

plt.xlabel("Predicted")
plt.ylabel("Truth")
Out[64]:
Text(33.22222222222222, 0.5, 'Truth')
No description has been provided for this image
In [65]:
from sklearn.svm import SVC
model = SVC()
In [66]:
model.fit(x_train,y_train)
Out[66]:
SVC()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
SVC()
In [67]:
model.score(x_test,y_test)
Out[67]:
0.7434257285003554
In [68]:
y_predicted = model.predict(x_test)
from sklearn.metrics import confusion_matrix
In [69]:
CM = confusion_matrix(y_test,y_predicted)
CM
Out[69]:
array([[1046,    0],
       [ 361,    0]], dtype=int64)
In [70]:
print(classification_report(y_test,y_predicted))
              precision    recall  f1-score   support

           0       0.74      1.00      0.85      1046
           1       0.00      0.00      0.00       361

    accuracy                           0.74      1407
   macro avg       0.37      0.50      0.43      1407
weighted avg       0.55      0.74      0.63      1407

In [71]:
plt.figure(figsize = (5,3))

sns.heatmap(CM, annot = True)

plt.xlabel("Predicted")
plt.ylabel("Truth")
Out[71]:
Text(33.22222222222222, 0.5, 'Truth')
No description has been provided for this image
In [72]:
# multiple algo to predict outcome
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier()
In [73]:
model.fit(x_train,y_train)
Out[73]:
RandomForestClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
RandomForestClassifier()
In [74]:
model.score(x_test,y_test)
Out[74]:
0.7931769722814499
In [75]:
y_predicted = model.predict(x_test)
from sklearn.metrics import confusion_matrix
In [76]:
CM = confusion_matrix(y_test,y_predicted)
CM
Out[76]:
array([[919, 127],
       [164, 197]], dtype=int64)
In [77]:
print(classification_report(y_test,y_predicted))
              precision    recall  f1-score   support

           0       0.85      0.88      0.86      1046
           1       0.61      0.55      0.58       361

    accuracy                           0.79      1407
   macro avg       0.73      0.71      0.72      1407
weighted avg       0.79      0.79      0.79      1407

In [78]:
plt.figure(figsize = (5,3))

sns.heatmap(CM, annot = True)

plt.xlabel("Predicted")
plt.ylabel("Truth")
Out[78]:
Text(33.22222222222222, 0.5, 'Truth')
No description has been provided for this image
In [79]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
In [80]:
model.fit(x_train,y_train)
Out[80]:
GaussianNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GaussianNB()
In [81]:
model.score(x_test,y_test)
Out[81]:
0.689410092395167
In [82]:
y_predicted = model.predict(x_test)
from sklearn.metrics import confusion_matrix
In [83]:
CM = confusion_matrix(y_test,y_predicted)
CM
Out[83]:
array([[664, 382],
       [ 55, 306]], dtype=int64)
In [84]:
print(classification_report(y_test,y_predicted))
              precision    recall  f1-score   support

           0       0.92      0.63      0.75      1046
           1       0.44      0.85      0.58       361

    accuracy                           0.69      1407
   macro avg       0.68      0.74      0.67      1407
weighted avg       0.80      0.69      0.71      1407

In [85]:
plt.figure(figsize = (5,3))

sns.heatmap(CM, annot = True)

plt.xlabel("Predicted")
plt.ylabel("Truth")
Out[85]:
Text(33.22222222222222, 0.5, 'Truth')
No description has been provided for this image
In [86]:
from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
In [87]:
model.fit(x_train,y_train)
Out[87]:
MultinomialNB()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MultinomialNB()
In [88]:
model.score(x_test,y_test)
Out[88]:
0.658137882018479
In [89]:
y_predicted = model.predict(x_test)
from sklearn.metrics import confusion_matrix
In [90]:
CM = confusion_matrix(y_test,y_predicted)
CM
Out[90]:
array([[637, 409],
       [ 72, 289]], dtype=int64)
In [91]:
print(classification_report(y_test,y_predicted))
              precision    recall  f1-score   support

           0       0.90      0.61      0.73      1046
           1       0.41      0.80      0.55       361

    accuracy                           0.66      1407
   macro avg       0.66      0.70      0.64      1407
weighted avg       0.77      0.66      0.68      1407

In [92]:
plt.figure(figsize = (5,3))

sns.heatmap(CM, annot = True)

plt.xlabel("Predicted")
plt.ylabel("Truth")
Out[92]:
Text(33.22222222222222, 0.5, 'Truth')
No description has been provided for this image
In [93]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
In [94]:
scaler.fit(x_test,y_test)
Out[94]:
StandardScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
StandardScaler()
In [95]:
scaler.fit_transform(x)
Out[95]:
array([[-0.44032709, -1.16169394, -0.99419409, ..., -0.5253508 ,
         1.40476387, -0.54360352],
       [-0.44032709, -0.26087792, -0.17373982, ..., -0.5253508 ,
        -0.71186341,  1.83957601],
       [-0.44032709, -0.36392329, -0.95964911, ..., -0.5253508 ,
        -0.71186341,  1.83957601],
       ...,
       [-0.44032709, -1.17000405, -0.85451414, ..., -0.5253508 ,
         1.40476387, -0.54360352],
       [ 2.27103902,  0.31916782, -0.87209546, ..., -0.5253508 ,
        -0.71186341,  1.83957601],
       [-0.44032709,  1.35793167,  2.01234407, ..., -0.5253508 ,
        -0.71186341, -0.54360352]])
In [96]:
from sklearn.preprocessing import MinMaxScaler
model = MinMaxScaler()
In [97]:
model.fit(x_train,y_train)
Out[97]:
MinMaxScaler()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
MinMaxScaler()
In [98]:
model.fit_transform(x)
Out[98]:
array([[0.        , 0.11542289, 0.0012751 , ..., 0.        , 1.        ,
        0.        ],
       [0.        , 0.38507463, 0.21586661, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.35422886, 0.01031041, ..., 0.        , 0.        ,
        1.        ],
       ...,
       [0.        , 0.11293532, 0.03780868, ..., 0.        , 1.        ,
        0.        ],
       [1.        , 0.55870647, 0.03321025, ..., 0.        , 0.        ,
        1.        ],
       [0.        , 0.86965174, 0.78764136, ..., 0.        , 0.        ,
        0.        ]])
In [99]:
from sklearn.decomposition import PCA
model = PCA()
In [100]:
model.fit(x_train,y_train)
Out[100]:
PCA()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
PCA()
In [101]:
model.fit_transform(x)
Out[101]:
array([[-2.25366835e+03, -1.54045960e+01,  2.41138473e+00, ...,
        -1.51660698e-13, -5.75917529e-14,  8.49111474e-14],
       [-3.93853721e+02, -4.44308307e+00,  1.11164960e+00, ...,
        -1.39011273e-13, -5.43819132e-13, -2.50122100e-13],
       [-2.17516400e+03,  7.87670140e+00,  1.15303139e+00, ...,
         4.61012805e-13, -3.30919668e-13, -3.11723294e-13],
       ...,
       [-1.93708238e+03, -1.84016387e+01,  2.46992124e+00, ...,
        -8.50874545e-14,  2.89204519e-14, -1.14256834e-14],
       [-1.97654365e+03,  2.67272984e+01,  2.46058887e-01, ...,
        -9.49510228e-14,  9.17079680e-14, -3.79921907e-14],
       [ 4.56138245e+03,  1.41903237e+00, -4.40312995e-01, ...,
        -1.24732051e-13,  9.92906304e-14, -1.56826950e-14]])
In [102]:
from sklearn.linear_model import LogisticRegression
lr=LogisticRegression()
model=lr.fit(x_train,y_train)
pred=model.predict(x_test)
pred
Out[102]:
array([0, 0, 0, ..., 0, 0, 0], dtype=int64)
In [103]:
from sklearn.metrics import mean_squared_error,confusion_matrix,classification_report,accuracy_score
from sklearn.metrics import roc_curve,roc_auc_score,accuracy_score,mean_absolute_error
from sklearn.metrics import r2_score,mean_squared_error
In [104]:
print('classification_report')
print(classification_report(y_test,pred))
classification_report
              precision    recall  f1-score   support

           0       0.85      0.90      0.88      1046
           1       0.66      0.55      0.60       361

    accuracy                           0.81      1407
   macro avg       0.76      0.73      0.74      1407
weighted avg       0.81      0.81      0.81      1407

In [105]:
print('accuracy_score')
print(accuracy_score(y_test,pred))
accuracy_score
0.8130774697938877
In [106]:
print('confusion_matrix')
print(confusion_matrix(y_test,pred))
confusion_matrix
[[944 102]
 [161 200]]
In [107]:
print('mean_squared_error')
print(mean_squared_error(y_test,pred))
mean_squared_error
0.1869225302061123
In [108]:
print('roc_curve')
print(roc_curve(y_test,pred))
roc_curve
(array([0.        , 0.09751434, 1.        ]), array([0.        , 0.55401662, 1.        ]), array([2, 1, 0], dtype=int64))
In [109]:
print('roc_auc_score')
print(roc_auc_score(y_test,pred))
roc_auc_score
0.7282511400772234
In [110]:
from sklearn import metrics

XYZ = lr.predict_proba(x_test)[::,1]


fpr, tpr, _ = metrics.roc_curve(y_test,  XYZ)

#create ROC curve
plt.plot(fpr,tpr)
plt.ylabel('True Positive Rate')
plt.xlabel('False Positive Rate')
plt.show()
No description has been provided for this image
In [111]:
print('accuracy_score')
print(accuracy_score(y_test,pred))
accuracy_score
0.8130774697938877
In [112]:
print('mean_absolute_error')
print(mean_absolute_error(y_test,pred))
mean_absolute_error
0.1869225302061123
In [113]:
print('r2_score')
print(r2_score(y_test,pred))
r2_score
0.02003410962749541
In [114]:
print('mean_squared_error')
print(mean_squared_error(y_test,pred))
mean_squared_error
0.1869225302061123
In [115]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier()
In [116]:
model.fit(x_test,y_test)
Out[116]:
GradientBoostingClassifier()
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
GradientBoostingClassifier()
In [117]:
model.score(x_test,y_test)
Out[117]:
0.8869936034115139
In [118]:
y_predicted = model.predict(x_test)
from sklearn.metrics import confusion_matrix
In [119]:
CM = confusion_matrix(y_test,y_predicted)
CM
Out[119]:
array([[1004,   42],
       [ 117,  244]], dtype=int64)
In [120]:
print(classification_report(y_test,y_predicted))
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1046
           1       0.85      0.68      0.75       361

    accuracy                           0.89      1407
   macro avg       0.87      0.82      0.84      1407
weighted avg       0.88      0.89      0.88      1407

In [121]:
plt.figure(figsize = (5,3))

sns.heatmap(CM, annot = True)

plt.xlabel("Predicted")
plt.ylabel("Truth")
Out[121]:
Text(33.22222222222222, 0.5, 'Truth')
No description has been provided for this image
In [122]:
import tensorflow
from tensorflow import keras
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense
In [123]:
model = Sequential()
model.add(Dense(3,activation = 'sigmoid',input_dim =50))
model.add(Dense(1,activation = 'sigmoid'))
In [124]:
model.summary()
Model: "sequential"
┏━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━┓
┃ Layer (type)                         ┃ Output Shape                ┃         Param # ┃
┡━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━┩
│ dense (Dense)                        │ (None, 3)                   │             153 │
├──────────────────────────────────────┼─────────────────────────────┼─────────────────┤
│ dense_1 (Dense)                      │ (None, 1)                   │               4 │
└──────────────────────────────────────┴─────────────────────────────┴─────────────────┘
 Total params: 157 (628.00 B)
 Trainable params: 157 (628.00 B)
 Non-trainable params: 0 (0.00 B)

(503)+3 = 153 __ Nural Network__ (31)+1 =4

In [125]:
model.compile(loss = 'binary_crossentropy',optimizer = 'Adam')
In [126]:
history = model.fit(x_train,y_train, epochs = 10,validation_split = 0.2)
Epoch 1/10
141/141 ━━━━━━━━━━━━━━━━━━━━ 3s 5ms/step - loss: 0.5621 - val_loss: 0.5353
Epoch 2/10
141/141 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.5435 - val_loss: 0.5085
Epoch 3/10
141/141 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.5249 - val_loss: 0.4995
Epoch 4/10
141/141 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.5088 - val_loss: 0.4964
Epoch 5/10
141/141 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.5063 - val_loss: 0.4822
Epoch 6/10
141/141 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.4996 - val_loss: 0.4748
Epoch 7/10
141/141 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.4954 - val_loss: 0.4680
Epoch 8/10
141/141 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.4954 - val_loss: 0.4767
Epoch 9/10
141/141 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step - loss: 0.4844 - val_loss: 0.4740
Epoch 10/10
141/141 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step - loss: 0.4943 - val_loss: 0.4747
In [127]:
history.history
Out[127]:
{'loss': [0.557719886302948,
  0.5357632040977478,
  0.5170425772666931,
  0.5098909139633179,
  0.5036367177963257,
  0.4997670650482178,
  0.49613064527511597,
  0.49850934743881226,
  0.4926097095012665,
  0.49803227186203003],
 'val_loss': [0.5352861881256104,
  0.5085173845291138,
  0.4994664192199707,
  0.4964390993118286,
  0.48221153020858765,
  0.474754273891449,
  0.46796777844429016,
  0.47672298550605774,
  0.47400644421577454,
  0.4747053384780884]}
In [128]:
model.layers[0].get_weights()
Out[128]:
[array([[-0.5463179 ,  0.05650653,  0.3650551 ],
        [ 0.03742829, -0.14135025,  0.13952711],
        [-0.06450991,  0.06037767, -0.00681495],
        [-0.17995971, -0.28102002, -0.1785467 ],
        [-0.00374536, -0.19732894, -0.4361244 ],
        [-0.04328292, -0.14842427,  0.11460909],
        [ 0.26719922,  0.03449143,  0.01766997],
        [ 0.3253568 ,  0.12997107,  0.17704494],
        [ 0.01427152, -0.04898315, -0.2847259 ],
        [ 0.10231312, -0.02309847, -0.4784282 ],
        [-0.22372206, -0.28719616, -0.2697603 ],
        [-0.09915258, -0.08608794,  0.08483568],
        [-0.23850799,  0.03314799, -0.17219527],
        [-0.04669595,  0.05355498, -0.07766202],
        [ 0.33629373, -0.02160749,  0.06255865],
        [ 0.26900995, -0.32996234, -0.09059858],
        [ 0.03602493, -0.20280282, -0.45306292],
        [-0.08447246,  0.21260592, -0.14637433],
        [ 0.1530685 ,  0.08943116, -0.3217084 ],
        [-0.02512531, -0.19206329,  0.04147688],
        [ 0.43579125, -0.17336755, -0.23017651],
        [-0.42269108, -0.07303046,  0.74278957],
        [-0.14707884, -0.04528806, -0.5965806 ],
        [-0.0638312 ,  0.18509586, -0.14188132],
        [ 0.0486447 , -0.06334215, -0.54371154],
        [ 0.14677532,  0.3282468 , -0.4293225 ],
        [-0.27745435, -0.3757026 ,  0.15246469],
        [-0.25772613,  0.32732505, -0.59927493],
        [ 0.1539736 ,  0.3636654 , -0.3323758 ],
        [-0.05427227,  0.19733638,  0.30169678],
        [ 0.02738924, -0.28589797, -0.20484433],
        [-0.48669294, -0.32840505, -0.05018048],
        [ 0.12742779,  0.1878976 ,  0.31367347],
        [ 0.34434706,  0.12863828, -0.1261132 ],
        [-0.09824098, -0.177224  , -0.629373  ],
        [ 0.1405677 , -0.24785283, -0.32302347],
        [ 0.31714106,  0.19785787, -0.32872766],
        [-0.40762943, -0.02202698, -0.01933847],
        [ 0.04765834,  0.15024097, -0.30880463],
        [-0.05138935,  0.02901016, -0.5496591 ],
        [ 0.02611189,  0.04118262,  0.0641803 ],
        [ 0.26618567,  0.13252485, -0.17799418],
        [ 0.43030825,  0.37173033, -0.8854663 ],
        [ 0.1212143 ,  0.14268833, -0.8258031 ],
        [ 0.25864804,  0.4541043 , -0.8002766 ],
        [-0.38748187, -0.08299001, -0.0490102 ],
        [ 0.02894197,  0.20624974, -0.4041168 ],
        [ 0.37240303,  0.1631575 , -0.11515518],
        [-0.48053962, -0.26066142,  0.41042686],
        [ 0.44895425,  0.01954965, -0.67026377]], dtype=float32),
 array([ 0.07838272, -0.00113499, -0.23922464], dtype=float32)]
In [129]:
model.layers[1].get_weights()
Out[129]:
[array([[-0.35099477],
        [-1.2909899 ],
        [ 1.4429581 ]], dtype=float32),
 array([-0.40447542], dtype=float32)]
In [130]:
model.predict(x_test)
44/44 ━━━━━━━━━━━━━━━━━━━━ 0s 3ms/step
Out[130]:
array([[0.15514916],
       [0.15505856],
       [0.23293753],
       ...,
       [0.15505846],
       [0.15505846],
       [0.1550587 ]], dtype=float32)

To convert in to "0" , "1" we have to give a threshhold value

In [131]:
y_log = model.predict(x_test)
44/44 ━━━━━━━━━━━━━━━━━━━━ 0s 2ms/step
In [132]:
np.where(y_log>0.5,1,0)
Out[132]:
array([[0],
       [0],
       [0],
       ...,
       [0],
       [0],
       [0]])
In [133]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_predicted)
Out[133]:
0.8869936034115139
In [134]:
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
Out[134]:
[<matplotlib.lines.Line2D at 0x250edc37910>]
No description has been provided for this image
In [135]:
print(classification_report(y_test,y_predicted))
              precision    recall  f1-score   support

           0       0.90      0.96      0.93      1046
           1       0.85      0.68      0.75       361

    accuracy                           0.89      1407
   macro avg       0.87      0.82      0.84      1407
weighted avg       0.88      0.89      0.88      1407

In [ ]: